{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a06a3bfc",
   "metadata": {},
   "source": [
    "MEASURE OF POPULISM FOR PRESIDENTIAL ELECTIONS \n",
    "\n",
    "including for different dimensions and by topic "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e7f3235",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PACKAGES #############################################################\n",
    "\n",
    "import string\n",
    "import re\n",
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "from gensim.parsing.preprocessing import remove_stopwords\n",
    "from gensim.parsing.preprocessing import stem_text\n",
    "from nltk.corpus import wordnet as wn\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from lexical_diversity import lex_div as ld\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a84cf8f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### DICTIONARY ################################################################\n",
    "# dizionario pop\n",
    "\n",
    "pop = ['deceit','treason','betray','absurd','arrogant','promis','promise','capitul','corrupt','direct','elite', 'establishm', 'ruling', 'caste', 'class', 'mafia', 'undemocratic', 'particrat', 'politic', 'propaganda', 'referend', 'regime','shameless','shame','admit', 'tradition', 'people']\n",
    "\n",
    "\n",
    "    ###### VERSIONE 1 WORD FREQUENCY #########\n",
    "\n",
    "from nltk.corpus import wordnet as wn\n",
    "\n",
    "pop_s = [stem_text(i) for i in pop]\n",
    "\n",
    "wildcards = ''\n",
    "for x in pop_s:\n",
    "    x = x + '[a-z]*'\n",
    "    wildcards += '(' + x + ')|'\n",
    "wildcards = wildcards[:-1]\n",
    "del x\n",
    "\n",
    "\n",
    "matches = [] # complete thw wildwords\n",
    "for i, word in enumerate(wn.all_synsets()):\n",
    "    w = word.lemma_names()[0].lower()\n",
    "    if '_' in w:\n",
    "        continue\n",
    "    if re.match(wildcards, w):\n",
    "        matches.append(w)\n",
    "del i\n",
    "del w\n",
    "\n",
    "matches = list(set(matches).union(pop)) # final dictionary\n",
    "matches = list(set([stem_text(i) for i in matches]))\n",
    "\n",
    "    ## NB: Manually delete some irrelevant results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a141c0a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Virtuous people\n",
    "people = ['peopl', 'tradit', 'tradition', 'direct', 'directli', 'referendum']\n",
    "\n",
    "# Corrupt elite\n",
    "elite = ['cast', 'class', 'elit', 'elitist', 'establish',\n",
    "         'polit', 'politic', 'politician', 'corrupt', 'regim',\n",
    "         'regimen', 'rule', 'propaganda', 'directori', 'promin',\n",
    "         'arrog', 'arrogantli', 'betrai', 'treason', 'promis', 'shame',\n",
    "        'undemocrat', 'deceit', 'absurd', 'absurdli', 'admit', 'admitt']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae3a6112",
   "metadata": {},
   "outputs": [],
   "source": [
    " ### UPLOAD SPEECHES #####\n",
    "\n",
    "os.chdir('/Users/gloria/Dropbox/Progetti/Rhetoric/topics')\n",
    "trump = pd.read_csv('all_punctuated_trump.csv', sep=',', encoding='utf-8')\n",
    "trump = trump.drop('Unnamed: 0', 1)\n",
    "trump['candidate'] = 'trump'\n",
    "\n",
    "clinton = pd.read_csv('all_punctuated_clinton.csv', sep=',', encoding='utf-8')\n",
    "clinton = clinton.drop('Unnamed: 0', 1)\n",
    "clinton = clinton.drop('presidency', 1)\n",
    "clinton['candidate'] = 'clinton'\n",
    "\n",
    "df = pd.concat([trump, clinton])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e76849d",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Clean text\n",
    "\n",
    "text = list(df['text'])\n",
    "text = [i.strip(\"/\") for i in text]\n",
    "text=[re.sub(r'\\.(?! )', '. ', re.sub(r' +', ' ', t)) for t in text]  #to add spaces after full stops. in the row text many mistakes\n",
    "text=[t.replace(']', '] ') for t in text]\n",
    "text=[t.replace(':', ': ') for t in text]\n",
    "text=[re.sub(\"[\\[].*?[\\]]\", \"\", t) for t in text]\n",
    "daeliminare = ['â€”','â€“','Â','“','-','--',] # add here all words we want to eliminare\n",
    "for i in daeliminare:\n",
    "    text=[t.replace(i, ' ') for t in text]\n",
    "sep = 'APP Note:' # per eliminare i commenti a margine - le note finali dell'editore\n",
    "text = [t.split(sep, 1)[0] for t in text]\n",
    "sep = 'NOTE:'\n",
    "text = [t.split(sep, 1)[0] for t in text]\n",
    "sep = 'Citation:'\n",
    "text = [t.split(sep, 1)[0] for t in text]\n",
    "del sep\n",
    "\n",
    "text2 = [t.lower() for t in text]\n",
    "translator = str.maketrans('','',string.punctuation)\n",
    "text2 = [t.translate(translator) for t in text2]\n",
    "text2 = [''.join([i for i in item if not i.isdigit()]) for item in text2]\n",
    "text2 = [remove_stopwords(item) for item in text2]\n",
    "text2 = [item for item in text2 if len(item.split())>2]\n",
    "text2 = [stem_text(i) for i in text2]\n",
    "\n",
    "df['text2'] = text2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c929498",
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf = TfidfVectorizer(min_df=0.01,\n",
    "                        stop_words='english',\n",
    "                        use_idf=True)\n",
    "\n",
    "X_tfidf = tfidf.fit_transform(df['text2'])\n",
    "feature_names = tfidf.get_feature_names()\n",
    "\n",
    "pop_people = []\n",
    "for i in range(len(df['text2'])):\n",
    "    feature_index = X_tfidf[i,:].nonzero()[1]\n",
    "    tfidf_scores = zip(feature_index, [X_tfidf[i, x] for x in feature_index])\n",
    "    l = []\n",
    "    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:\n",
    "        if w in people:\n",
    "            l.append([w, s])\n",
    "        else:\n",
    "            continue\n",
    "    p = sum(n for _, n in l)\n",
    "    pop_people.append(p)\n",
    "\n",
    "\n",
    "pop_elite = []\n",
    "for i in range(len(df['text2'])):\n",
    "    feature_index = X_tfidf[i,:].nonzero()[1]\n",
    "    tfidf_scores = zip(feature_index, [X_tfidf[i, x] for x in feature_index])\n",
    "    l = []\n",
    "    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:\n",
    "        if w in elite:\n",
    "            l.append([w, s])\n",
    "        else:\n",
    "            continue\n",
    "    p = sum(n for _, n in l)\n",
    "    pop_elite.append(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55142660",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['elite_score'] = pop_elite\n",
    "df['people_score'] = pop_people\n",
    "\n",
    "df['pop_dim2'] = np.where( df[['people', 'elite']].all(axis=1)==0, 0, df[['people', 'elite']].sum(axis=1))\n",
    "\n",
    "# Clean dataset\n",
    "df = df[['candidate', 'place', 'time', 'pop_dim2']]\n",
    "df.to_csv('pop_pres_dim.csv')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
